Load packages
library(data.table) # For faster data manipulation
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf) # For spatial data handling
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(leaflet) # For interactive maps
library(leaflet.extras) # For additional leaflet features
library(mapview) # For easier map visualization
library(tmap) # For thematic maps
library(tigris) # For US road networks
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(future) # For parallel processing
library(future.apply) # For parallel processing with apply functions
# NOTE: a duplicate library(sf) call was removed here; sf is already attached above
library(RPostgres) # For PostgreSQL connection
library(DBI) # For database interface
# Create the tigris cache directory if it doesn't exist yet
if (!dir.exists("./data/tigris")) {
dir.create("./data/tigris", recursive = TRUE)
}
# Point tigris at the custom cache directory
options(tigris_cache_dir = "./data/tigris")
# Configure tigris to cache downloaded shapefiles across sessions
options(tigris_use_cache = TRUE)
# Function to create buffers in batches with proper projection
# Buffer the features of an sf object in batches, using a meter-based CRS.
#
# sf_object   : an sf object in any CRS
# buffer_dist : buffer distance in meters (applied in EPSG:3310)
# batch_size  : number of features buffered per batch (default 500)
#
# Returns the buffered features, reprojected back to the input's CRS.
create_buffers_in_batches <- function(sf_object, buffer_dist, batch_size = 500) {
  # Reproject to a projected, meter-based CRS so buffer_dist is in meters.
  # EPSG:3310 (NAD83 / California Albers) is appropriate for California and
  # matches the SRID used in the PostGIS pipeline below.
  # (The previous comment mislabeled 3310 as "Web Mercator" -- 3857 is Web
  # Mercator; 3310 is California Albers.)
  message("Reprojecting data to a meter-based CRS...")
  sf_object_projected <- st_transform(sf_object, 3310) # NAD83 / California Albers

  n_features <- nrow(sf_object_projected)
  n_batches <- ceiling(n_features / batch_size)

  # Preallocate the result list; seq_len() below is safe even when
  # n_batches == 0 (1:0 would iterate twice by mistake).
  buffer_list <- vector("list", n_batches)

  for (i in seq_len(n_batches)) {
    start_idx <- (i - 1) * batch_size + 1
    end_idx <- min(i * batch_size, n_features)
    cat(sprintf("Processing batch %d of %d (features %d to %d)\n",
                i, n_batches, start_idx, end_idx))
    # Extract and buffer this batch (distance is in meters in EPSG:3310)
    batch <- sf_object_projected[start_idx:end_idx, ]
    buffer_list[[i]] <- st_buffer(batch, dist = buffer_dist)
  }

  # Combine all batches and reproject back to the caller's CRS
  result <- do.call(rbind, buffer_list)
  message("Reprojecting results back to original CRS...")
  st_transform(result, st_crs(sf_object))
}
Load data (filtered to California for now, to limit the data set size)
# Load the accidents CSV and do the filtering/augmenting inside data.table
# for speed; only the final result is materialized as a tibble.
# NOTE(review): assumes fread() parses Start_Time/End_Time into date-time
# values so the data.table date helpers and difftime() work -- confirm
# against the CSV's column formats.
df.acc <- fread("data/us_accidents/US_accidents_March23.csv")[
# Filter date range of 2021
lubridate::year(as.Date(Start_Time)) == 2021 &
# And California
State == "CA"
][, `:=`(
# Add year, quarter, month columns. data.table:: is namespaced explicitly
# because lubridate masks these generics (see the attach conflicts above).
year = data.table::year(Start_Time),
quarter = data.table::quarter(Start_Time),
month = data.table::month(Start_Time),
# Calculate duration in days (assuming End_Time exists in the dataset)
duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>%
as_tibble() # Convert to tibble only at the end for performance
# Load the construction CSV with the same pattern as the accidents above.
# FIX: year/quarter/month are now namespaced with data.table:: -- the bare
# calls dispatched to lubridate (which masks data.table, see the attach
# conflicts above), making this block inconsistent with the accidents block.
df.const <- fread("data/us_constructions/US_constructions_Dec21.csv")[
# Filter date range of 2021
lubridate::year(as.Date(Start_Time)) == 2021 &
# And California
State == "CA"
][, `:=`(
# Add year, quarter, month columns (explicit data.table:: dispatch)
year = data.table::year(Start_Time),
quarter = data.table::quarter(Start_Time),
month = data.table::month(Start_Time),
# Calculate duration in days (assuming End_Time exists in the dataset)
duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>%
as_tibble() # Convert to tibble only at the end for performance
Convert dataframes to sf objects
# Accident records as an sf point layer in WGS 84 (EPSG:4326), keeping only
# rows that actually carry start coordinates.
df.acc.sf <- df.acc %>%
  filter(!is.na(Start_Lat), !is.na(Start_Lng)) %>%
  st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)

# Construction records as an sf point layer in WGS 84, same filtering.
df.const.sf <- df.const %>%
  filter(!is.na(Start_Lat), !is.na(Start_Lng)) %>%
  st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)
# Get US roads (this can be slow for the entire US, so we limit by state).
# We also only fetch Primary and Secondary roads, as they are the most
# important (i.e., where most constructions and accidents happen); otherwise
# the dataset would become too large (~1.1 GB).
ca_roads <- primary_secondary_roads("CA", year = 2021)
# Alternative below: fetch ALL roads for California (not recommended due to
# size). tigris only serves full road data per county, hence the lapply.
# First get all counties in California
#ca_counties <- counties("CA", year = 2021)
# Then get roads for each county
#ca_roads_list <- lapply(ca_counties$COUNTYFP, function(county_code) {
# roads("CA", county = county_code, year = 2021)
#})
# Optionally combine all county road data into one object
#ca_roads <- do.call(rbind, ca_roads_list)
# Static map of accidents over the California road network, colored by year.
# (The data were already filtered to State == "CA" and 2021 above; the extra
# filter() below is a harmless belt-and-braces check.)
ca_plot <- ggplot() +
  # linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_sf(data = ca_roads, color = "gray80", linewidth = 0.1) +
  geom_sf(data = df.acc.sf %>% filter(State == "CA"),
          aes(color = factor(year)), alpha = 0.7, size = 0.5) +
  scale_color_viridis_d(name = "Year") + # Viridis palette for discrete values
  theme_minimal() +
  # Title corrected: the data cover California accidents in 2021 only,
  # not the US 2016-2021
  labs(title = "California Road Accidents (2021)") +
  theme(legend.position = "bottom")
print(ca_plot)
# Save the figure to disk
ggsave(filename = "imgs/california_accidents.png",
       plot = ca_plot,
       width = 10, # width in inches
       height = 8, # height in inches
       dpi = 300) # resolution
# Static map of construction sites over the California road network,
# colored by year (data filtered to CA / 2021 above).
ca_const_plot <- ggplot() +
  # linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_sf(data = ca_roads, color = "gray80", linewidth = 0.1) +
  geom_sf(data = df.const.sf %>% filter(State == "CA"),
          aes(color = factor(year)), alpha = 0.7, size = 0.5) +
  scale_color_viridis_d(name = "Year") + # Viridis palette for discrete values
  theme_minimal() +
  # Title corrected: the data cover California construction sites in 2021 only
  labs(title = "California Construction Sites (2021)") +
  theme(legend.position = "bottom")
print(ca_const_plot)
# Save the figure to disk
ggsave(filename = "imgs/california_construction.png",
       plot = ca_const_plot,
       width = 10, # width in inches
       height = 8, # height in inches
       dpi = 300) # resolution
# Density heatmap of accidents over the California road network; low-density
# areas fade out via the alpha mapping.
accident_heatmap <- ggplot() +
  # Base layer: CA primary/secondary roads
  # (linewidth replaces the deprecated `size` aesthetic, ggplot2 >= 3.4)
  geom_sf(data = ca_roads, color = "gray80", linewidth = 0.1) +
  # 2D kernel density estimate rendered as tiles; fill and alpha are both
  # driven by the estimated density so sparse regions stay transparent
  stat_density_2d(
    data = df.acc %>% filter(!is.na(Start_Lat) & !is.na(Start_Lng)),
    aes(x = Start_Lng, y = Start_Lat, fill = after_stat(density), alpha = after_stat(density)),
    geom = "tile",
    contour = FALSE,
    h = 0.1, # KDE bandwidth (degrees)
    n = 200  # grid resolution
  ) +
  # Alpha scaling with a hard floor of full transparency
  scale_alpha_continuous(range = c(0, 0.9), guide = "none") +
  # Color ramp similar to leaflet's default heatmap
  scale_fill_gradientn(
    colors = c("#0000FF", "#00FFFF", "#00FF00", "#FFFF00", "#FF0000"),
    name = "Accident\nDensity"
  ) +
  # Clamp the view to California's bounding box
  coord_sf(
    xlim = c(-124.5, -114.5),
    ylim = c(32.5, 42.5)
  ) +
  theme_minimal() +
  labs(
    # Title corrected: the data were filtered to 2021 above
    title = "Accident Density Heatmap in California (2021)",
    x = NULL,
    y = NULL
  ) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold")
  )
# Display the plot
print(accident_heatmap)
ggsave(filename = "imgs/accident_heatmap.png",
       plot = accident_heatmap,
       width = 10, # width in inches
       height = 8, # height in inches
       dpi = 300) # resolution
# Density heatmap of construction sites over the California road network
construction_heatmap <- ggplot() +
  # Base layer: CA primary/secondary roads
  # (linewidth replaces the deprecated `size` aesthetic, ggplot2 >= 3.4)
  geom_sf(data = ca_roads, color = "gray80", linewidth = 0.1) +
  # 2D kernel density estimate rendered as tiles
  stat_density_2d(
    data = df.const %>% filter(!is.na(Start_Lat) & !is.na(Start_Lng)),
    aes(x = Start_Lng, y = Start_Lat, fill = after_stat(density)),
    geom = "tile",
    contour = FALSE,
    alpha = 0.7, # constant transparency; no alpha aesthetic is mapped here
    h = 0.1, # Bandwidth
    n = 200 # Resolution
  ) +
  # NOTE: the scale_alpha_continuous() call copied from the accidents heatmap
  # was removed -- it was a no-op because alpha is a constant, not mapped.
  # Color scheme matching the leaflet construction map below
  scale_fill_gradientn(
    colors = c("yellow", "orange", "red"),
    name = "Construction\nDensity"
  ) +
  # Clamp the view to California's bounding box
  coord_sf(
    xlim = c(-124.5, -114.5),
    ylim = c(32.5, 42.5)
  ) +
  theme_minimal() +
  labs(
    # Title corrected: the data were filtered to 2021 above
    title = "Construction Density Heatmap in California (2021)",
    x = NULL,
    y = NULL
  ) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold")
  )
# Display the plot
print(construction_heatmap)
# Interactive heatmap of accident locations (often better than static plots
# for large datasets), centered on and clamped to California.
accident_map <- leaflet() %>%
  addTiles() %>%
  addHeatmap(
    data = df.acc.sf,
    intensity = ~1, # every point contributes equally
    radius = 8,
    blur = 10
  ) %>%
  # Center the initial view on California
  setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>%
  # Keep panning inside the state's bounding box (NW and SE corners)
  setMaxBounds(lng1 = -124.6, lat1 = 42.0,
               lng2 = -114.1, lat2 = 32.5)
# Display the map
accident_map
# Interactive heatmap of construction sites, centered on and clamped to
# California, with a yellow-to-red intensity ramp.
construction_map <- leaflet() %>%
  addTiles() %>%
  addHeatmap(
    data = df.const.sf,
    intensity = ~1, # every site contributes equally
    radius = 8,
    blur = 10,
    gradient = c("yellow", "orange", "red")
  ) %>%
  # Center the initial view on California
  setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>%
  # Keep panning inside the state's bounding box (NW and SE corners)
  setMaxBounds(lng1 = -124.6, lat1 = 42.0,
               lng2 = -114.1, lat2 = 32.5)
# Display the map
construction_map
We now also represent construction sites as line objects, since each record has both a start and an end point.
# Convert construction data to a LINESTRING sf object: each record becomes a
# two-point line from (Start_Lng, Start_Lat) to (End_Lng, End_Lat) in WGS 84.
df.const.lines <- df.const %>%
filter(!is.na(Start_Lat) & !is.na(Start_Lng) & !is.na(End_Lat) & !is.na(End_Lng)) %>%
mutate(geometry = pmap(list(Start_Lng, Start_Lat, End_Lng, End_Lat),
function(start_lng, start_lat, end_lng, end_lat) {
# 2x2 coordinate matrix: one row per point, columns are lng/lat
coords <- matrix(c(start_lng, start_lat,
end_lng, end_lat),
ncol = 2, byrow = TRUE)
st_linestring(coords)
})) %>%
st_sf(crs = 4326)
# NOTE(review): `geometry` is a plain list column of sfg objects when st_sf()
# receives it; presumably st_sf() promotes it to an sfc with the given CRS --
# verify, or wrap the column in st_sfc() explicitly if the CRS ends up unset.
Since the files are so large and spatial operations in R are incredibly slow, we resort to a PostgreSQL database for the following operations. PostGIS builds on the same simple-features model as sf, so the R functions used above map almost one-to-one onto the SQL functions below.
# Open the PostgreSQL connection used for all PostGIS work below.
# The password is read from the POSTGRES_PWD environment variable so no
# credential is hard-coded in the script.
con <- dbConnect(
Postgres(),
dbname = "geospatial_final",
host = "localhost",
port = 5432,
# Adjust these credentials for your local setup if needed
user = "postgres",
password = Sys.getenv("POSTGRES_PWD")
)
# Session-level PostgreSQL tuning for the heavy spatial joins that follow.
# These SET statements only affect this connection, not the server config.
dbExecute(con, "SET work_mem = '128MB'") # Increase per-operation working memory (sorts/hashes)
## [1] 0
dbExecute(con, "SET maintenance_work_mem = '256MB'") # Speeds up index creation
## [1] 0
dbExecute(con, "SET random_page_cost = 1.1") # Tuned for SSD storage
## [1] 0
dbExecute(con, "SET effective_cache_size = '2GB'") # Planner's estimate of OS cache
## [1] 0
dbExecute(con, "SET max_parallel_workers_per_gather = 4") # Allow parallel scans
## [1] 0
# Sanity check: PostGIS must be installed for the spatial SQL below
postgis_check <- dbGetQuery(con, "SELECT PostGIS_version()")
cat("PostGIS version:", postgis_check[[1]], "\n")
## PostGIS version: 3.5 USE_GEOS=1 USE_PROJ=1 USE_STATS=1
# Ensure SRID 3310 (NAD83 / California Albers) exists in spatial_ref_sys.
# The buffer steps below transform geometries to 3310 so buffer distances
# are in meters; a stock PostGIS install normally ships this SRID, but the
# idempotent DO block inserts it if missing.
dbExecute(con, "
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM spatial_ref_sys WHERE srid = 3310) THEN
INSERT INTO spatial_ref_sys (srid, auth_name, auth_srid, proj4text, srtext)
VALUES (3310, 'EPSG', 3310,
'+proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs',
'PROJCS[\"NAD83 / California Albers\",GEOGCS[\"NAD83\",DATUM[\"North_American_Datum_1983\",SPHEROID[\"GRS 1980\",6378137,298.257222101,AUTHORITY[\"EPSG\",\"7019\"]],AUTHORITY[\"EPSG\",\"6269\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.01745329251994328,AUTHORITY[\"EPSG\",\"9122\"]],AUTHORITY[\"EPSG\",\"4269\"]],PROJECTION[\"Albers_Conic_Equal_Area\"],PARAMETER[\"standard_parallel_1\",34],PARAMETER[\"standard_parallel_2\",40.5],PARAMETER[\"latitude_of_center\",0],PARAMETER[\"longitude_of_center\",-120],PARAMETER[\"false_easting\",0],PARAMETER[\"false_northing\",-4000000],UNIT[\"metre\",1,AUTHORITY[\"EPSG\",\"9001\"]],AUTHORITY[\"EPSG\",\"3310\"]]');
END IF;
END
$$;
")
## [1] 0
# Bulk-load the three base tables (accident points, construction points,
# construction lines) into PostgreSQL, creating each only if it does not
# already exist, and index them for the spatial/temporal joins below.
# Begin transaction for data processing
dbExecute(con, "BEGIN")
## [1] 0
# Check if accidents table exists, if not create it
has_accidents <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'accidents')")$exists
if (!has_accidents) {
# Write accidents data to database (st_write accepts a DBI connection as dsn)
sf::st_write(df.acc.sf, con, "accidents",
layer_options = c("GEOM_TYPE=GEOMETRY",
"GEOMETRY_NAME=geometry",
"SRID=4326"))
cat("Written accidents data to database\n")
# GiST index for spatial predicates, B-tree on the timestamp used in joins
dbExecute(con, "CREATE INDEX idx_accidents_geometry ON accidents USING GIST(geometry)")
dbExecute(con, 'CREATE INDEX idx_accidents_start_time ON accidents("Start_Time");')
dbExecute(con, "ANALYZE accidents")
cat("Created spatial index for accidents table\n")
}
# Check if construction table exists, if not create it
has_construction <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'construction')")$exists
if (!has_construction) {
# Write construction data to database
sf::st_write(df.const.sf, con, "construction",
layer_options = c("GEOM_TYPE=GEOMETRY",
"GEOMETRY_NAME=geometry",
"SRID=4326"))
cat("Written construction data to database\n")
# Spatial index plus B-tree indexes on both ends of the activity window
dbExecute(con, "CREATE INDEX idx_construction_geometry ON construction USING GIST(geometry)")
dbExecute(con, 'CREATE INDEX idx_construction_start_time ON construction("Start_Time");')
dbExecute(con, 'CREATE INDEX idx_construction_end_time ON construction("End_Time");')
dbExecute(con, "ANALYZE construction")
cat("Created spatial index for construction table\n")
}
# Check if construction table with lines exists, if not create it
has_construction_lines <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'construction_lines')")$exists
if (!has_construction_lines) {
# Write line-geometry construction data to database
sf::st_write(df.const.lines, con, "construction_lines",
layer_options = c("GEOM_TYPE=GEOMETRY",
"GEOMETRY_NAME=geometry",
"SRID=4326"))
cat("Written construction_lines data to database\n")
# Spatial index plus B-tree indexes on both ends of the activity window
dbExecute(con, "CREATE INDEX idx_construction_lines_geometry ON construction_lines USING GIST(geometry)")
dbExecute(con, 'CREATE INDEX idx_construction_lines_start_time ON construction_lines("Start_Time");')
dbExecute(con, 'CREATE INDEX idx_construction_lines_end_time ON construction_lines("End_Time");')
dbExecute(con, "ANALYZE construction_lines")
cat("Created spatial index for construction_lines table\n")
}
# Commit transaction for base tables
dbExecute(con, "COMMIT")
## [1] 0
#---------------------------------------------------------------------------
# STEP 1A: Create enhanced construction buffers table with time ranges
# Each construction point gets a 300 m buffer (computed in EPSG:3310 meters,
# stored back in WGS 84) together with its activity window, so the later
# join can filter on space and time in one pass.
#---------------------------------------------------------------------------
has_construction_buffers <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'construction_buffers')")$exists
if (!has_construction_buffers) {
cat("Step 1A: Creating enhanced construction_buffers table with time ranges...\n")
dbExecute(con, "BEGIN")
# Create construction buffers table with appropriate projection and time ranges
dbExecute(con, "
CREATE TABLE construction_buffers AS
SELECT
\"ID\" as construction_id,
\"Start_Time\" as start_time,
\"End_Time\" as end_time,
ST_Transform(
ST_Buffer(
ST_Transform(geometry, 3310), -- Transform to California Albers for accurate distance
300 -- 300 meters buffer
),
4326 -- Transform back to WGS 84 for joining
) AS geometry
FROM construction
")
# Primary key, GiST index for the spatial join, composite index for the
# time-window predicate
dbExecute(con, "ALTER TABLE construction_buffers ADD PRIMARY KEY (construction_id)")
dbExecute(con, "CREATE INDEX idx_construction_buffers_geometry ON construction_buffers USING GIST(geometry)")
dbExecute(con, "CREATE INDEX idx_construction_buffers_times ON construction_buffers(start_time, end_time)")
dbExecute(con, "ANALYZE construction_buffers")
dbExecute(con, "COMMIT")
cat("Created enhanced construction_buffers table with spatial index and time ranges\n")
} else {
cat("Construction buffers table already exists, skipping creation\n")
}
## Construction buffers table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 1B: Create spatial matches table (using enhanced pre-computed buffers)
# One row per (accident, construction) pair where the accident lies inside
# the 300 m buffer AND happened during the construction's activity window.
#---------------------------------------------------------------------------
has_spatial_matches <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'spatial_matches')")$exists
if (!has_spatial_matches) {
cat("Step 1B: Creating spatial_matches table using enhanced pre-computed buffers...\n")
dbExecute(con, "BEGIN")
# Simplified join using pre-computed buffers with time information
dbExecute(con, "
CREATE TABLE spatial_matches AS
SELECT
a.\"ID\" AS accident_id,
a.\"Severity\" AS accident_severity,
a.\"Start_Time\" AS accident_time,
b.construction_id
FROM
accidents a
JOIN
construction_buffers b ON
ST_Within(a.geometry, b.geometry) AND
a.\"Start_Time\" BETWEEN b.start_time AND b.end_time
")
# Create indices for efficient join operations in later steps
dbExecute(con, "CREATE INDEX idx_spatial_accident ON spatial_matches(accident_id)")
dbExecute(con, "CREATE INDEX idx_spatial_construction ON spatial_matches(construction_id)")
dbExecute(con, "ANALYZE spatial_matches")
dbExecute(con, "COMMIT")
cat("Created spatial_matches table with combined temporal and spatial filtering\n")
} else {
cat("Spatial matches table already exists, skipping creation\n")
}
## Spatial matches table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 2: Count construction sites per accident (unchanged)
# Aggregates spatial_matches so each accident carries the number of active
# construction sites within 300 m at the time it happened.
#---------------------------------------------------------------------------
has_construction_counts <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'construction_counts')")$exists
if (!has_construction_counts) {
cat("Step 2: Creating construction_counts table...\n")
dbExecute(con, "BEGIN")
dbExecute(con, "
CREATE TABLE construction_counts AS
SELECT
accident_id,
COUNT(*) AS nearby_construction_count
FROM
spatial_matches
GROUP BY
accident_id
")
# Create index for efficient joins in the final step
dbExecute(con, "CREATE INDEX idx_construction_counts_accident ON construction_counts(accident_id)")
dbExecute(con, "ANALYZE construction_counts")
dbExecute(con, "COMMIT")
cat("Created construction_counts table\n")
} else {
cat("Construction counts table already exists, skipping creation\n")
}
## Construction counts table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 3: Create final analysis table (unchanged)
# Reduces the many-to-many matches to exactly one construction site per
# accident (picked at random to avoid selection bias), keeping the nearby
# construction count from Step 2.
#---------------------------------------------------------------------------
has_accident_construction_analysis <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'accident_construction_analysis')")$exists
if (!has_accident_construction_analysis) {
cat("Step 3: Creating final accident_construction_analysis table...\n")
dbExecute(con, "BEGIN")
dbExecute(con, "
CREATE TABLE accident_construction_analysis AS
WITH ranked_matches AS (
SELECT
sm.accident_id,
sm.construction_id,
sm.accident_severity,
sm.accident_time,
cc.nearby_construction_count,
-- Rank construction sites for each accident (random selection if multiple)
ROW_NUMBER() OVER (
PARTITION BY sm.accident_id
ORDER BY RANDOM() -- Random selection ensures unbiased assignment
) AS rank
FROM
spatial_matches sm
JOIN
construction_counts cc ON sm.accident_id = cc.accident_id
)
SELECT
accident_id,
construction_id,
'LINE' AS construction_type, -- Indicates we're working with line geometries only
accident_severity, -- For severity analysis
accident_time, -- For temporal analysis
1 AS temporal_overlap, -- Always 1 due to our temporal filter in Step 1
nearby_construction_count, -- Number of construction sites affecting this accident
1 AS is_near_active_construction -- Always 1 due to our filtering steps
FROM
ranked_matches
WHERE
rank = 1 -- Select only one construction site per accident
")
# Create indices for query optimization
dbExecute(con, "CREATE INDEX idx_aca_accident_id ON accident_construction_analysis(accident_id)")
dbExecute(con, "CREATE INDEX idx_aca_construction_id ON accident_construction_analysis(construction_id)")
dbExecute(con, "CREATE INDEX idx_aca_count ON accident_construction_analysis(nearby_construction_count)")
dbExecute(con, "ANALYZE accident_construction_analysis")
dbExecute(con, "COMMIT")
cat("Created accident_construction_analysis table\n")
} else {
cat("Accident construction analysis table already exists, skipping creation\n")
}
## Accident construction analysis table already exists, skipping creation
# Read the construction zones together with their accident counts
# (for visualization), most-affected zones first.
accidents_per_zone <- st_read(con, query = "
WITH construction_accident_counts AS (
SELECT
aca.construction_id,
COUNT(aca.accident_id) AS accident_count
FROM
accident_construction_analysis aca
GROUP BY
aca.construction_id
)
SELECT
cl.\"ID\" AS construction_id,
cac.accident_count,
cl.geometry
FROM
construction_lines cl
JOIN
construction_accident_counts cac ON cl.\"ID\" = cac.construction_id
ORDER BY
cac.accident_count DESC
")
# BUGFIX: COUNT() returns bigint, which RPostgres maps to bit64::integer64.
# cat()/max() on integer64 print the raw double bit pattern (hence the
# denormal garbage like 7.16e-322 in earlier runs), so convert to double.
accidents_per_zone$accident_count <- as.numeric(accidents_per_zone$accident_count)
# Get statistics on construction zones
cat("Construction zones with accidents:", nrow(accidents_per_zone), "\n")
## Construction zones with accidents: 10088
cat("Maximum accidents in a single construction zone:", max(accidents_per_zone$accident_count), "\n")
cat("Average accidents per construction zone:", round(mean(accidents_per_zone$accident_count), 2), "\n")
# Read all accidents, flagged with their associated construction data.
# LEFT JOIN keeps every accident; near_construction is 1 only where a match
# exists. Step 3's rank = 1 filter guarantees at most one analysis row per
# accident, so this join does not duplicate accidents.
accidents_with_construction <- st_read(con, query = "
SELECT
a.*,
CASE WHEN aca.accident_id IS NOT NULL THEN 1 ELSE 0 END AS near_construction,
aca.construction_id,
aca.nearby_construction_count,
aca.construction_type
FROM
accidents a
LEFT JOIN
accident_construction_analysis aca ON a.\"ID\" = aca.accident_id
")
# Distribution of accidents by construction density: how many accidents had
# exactly N active construction sites nearby (N = nearby_construction_count).
construction_density_stats <- dbGetQuery(con, "
SELECT
nearby_construction_count,
COUNT(*) AS accident_count
FROM
accident_construction_analysis
GROUP BY
nearby_construction_count
ORDER BY
nearby_construction_count
")
#---------------------------------------------------------------------------
# Report completion and verify table sizes
#---------------------------------------------------------------------------
cat("All tables are ready for analysis\n")
## All tables are ready for analysis
print(construction_density_stats)
## nearby_construction_count accident_count
## 1 1 32107
## 2 2 11554
## 3 3 5312
## 4 4 2771
## 5 5 1809
## 6 6 1109
## 7 7 789
## 8 8 538
## 9 9 493
## 10 10 322
## 11 11 199
## 12 12 239
## 13 13 185
## 14 14 144
## 15 15 113
## 16 16 69
## 17 17 58
## 18 18 82
## 19 19 105
## 20 20 76
## 21 21 78
## 22 22 56
## 23 23 38
## 24 24 82
## 25 25 32
## 26 26 98
## 27 27 29
## 28 28 27
## 29 29 50
## 30 30 38
## 31 31 36
## 32 32 14
## 33 33 39
## 34 34 36
## 35 35 25
## 36 36 27
## 37 37 12
## 38 38 8
## 39 39 18
## 40 40 12
## 41 41 14
## 42 42 14
## 43 43 16
## 44 44 5
## 45 45 11
## 46 46 10
## 47 47 8
## 48 48 12
## 49 49 4
## 50 50 5
## 51 51 5
## 52 52 4
## 53 53 2
## 54 54 8
## 55 55 8
## 56 56 2
## 57 57 2
## 58 58 4
## 59 59 1
## 60 60 5
## 61 61 10
## 62 62 8
## 63 63 1
## 64 64 2
## 65 65 12
## 66 66 5
## 67 67 2
## 68 68 5
## 69 69 4
## 70 70 2
## 71 71 6
## 72 72 3
## 73 73 1
## 74 74 1
## 75 75 2
## 76 76 3
## 77 77 3
## 78 78 7
## 79 79 5
## 80 80 3
## 81 81 5
## 82 82 3
## 83 83 8
## 84 84 1
## 85 85 5
## 86 86 3
## 87 87 3
## 88 89 2
## 89 96 2
## 90 118 1
## 91 147 1
## 92 185 1
# Total accidents summary - for comparison.
# BUGFIX: COUNT(*) comes back as bit64::integer64 via RPostgres; cat() on
# integer64 prints the raw double bit pattern (the denormal garbage like
# 1.69e-318 seen in earlier runs). as.numeric() dispatches to bit64's method
# and yields the correct value.
total_accidents <- as.numeric(dbGetQuery(con, "SELECT COUNT(*) as count FROM accidents")$count)
accidents_near_construction <- as.numeric(dbGetQuery(con, "SELECT COUNT(*) as count FROM accident_construction_analysis")$count)
cat("Total accidents in dataset:", total_accidents, "\n")
cat("Accidents near active construction:", accidents_near_construction, "\n")
cat("Percentage of accidents near active construction: ",
round(accidents_near_construction/total_accidents*100, 2), "%\n")
## Percentage of accidents near active construction: 17.28 %
#---------------------------------------------------------------------------
# STEP 1A-L: Create construction line buffers table with time ranges
# Same as Step 1A but for the LINESTRING geometries; additionally records
# each segment's length in meters for the per-km rates in Step 4-L.
#---------------------------------------------------------------------------
has_construction_line_buffers <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'construction_line_buffers')")$exists
if (!has_construction_line_buffers) {
cat("Step 1A-L: Creating construction_line_buffers table with time ranges...\n")
dbExecute(con, "BEGIN")
# Create line-based construction buffers with time information
dbExecute(con, "
CREATE TABLE construction_line_buffers AS
SELECT
\"ID\" as construction_id,
\"Start_Time\" as start_time,
\"End_Time\" as end_time,
ST_Transform(
ST_Buffer(
ST_Transform(geometry, 3310), -- Transform to California Albers for accurate distance
300 -- 300 meters buffer
),
4326 -- Transform back to WGS 84 for joining
) AS geometry,
ST_Length(ST_Transform(geometry, 3310)) AS segment_length_m -- Length in meters
FROM construction_lines
")
# Primary key, GiST index for the spatial join, composite time index
dbExecute(con, "ALTER TABLE construction_line_buffers ADD PRIMARY KEY (construction_id)")
dbExecute(con, "CREATE INDEX idx_construction_line_buffers_geometry ON construction_line_buffers USING GIST(geometry)")
dbExecute(con, "CREATE INDEX idx_construction_line_buffers_times ON construction_line_buffers(start_time, end_time)")
dbExecute(con, "ANALYZE construction_line_buffers")
dbExecute(con, "COMMIT")
cat("Created construction_line_buffers table with spatial index and time ranges\n")
} else {
cat("Construction line buffers table already exists, skipping creation\n")
}
## Construction line buffers table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 1B-L: Create spatial matches table for line objects
# Same join as Step 1B, against the line buffers; carries segment_length_m
# through for the rate computation later.
#---------------------------------------------------------------------------
has_line_spatial_matches <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'line_spatial_matches')")$exists
if (!has_line_spatial_matches) {
cat("Step 1B-L: Creating line_spatial_matches table using pre-computed line buffers...\n")
dbExecute(con, "BEGIN")
# Simplified join using pre-computed line buffers with time information
dbExecute(con, "
CREATE TABLE line_spatial_matches AS
SELECT
a.\"ID\" AS accident_id,
a.\"Severity\" AS accident_severity,
a.\"Start_Time\" AS accident_time,
b.construction_id,
b.segment_length_m
FROM
accidents a
JOIN
construction_line_buffers b ON
ST_Within(a.geometry, b.geometry) AND
a.\"Start_Time\" BETWEEN b.start_time AND b.end_time
")
# Create indices for efficient join operations
dbExecute(con, "CREATE INDEX idx_line_spatial_accident ON line_spatial_matches(accident_id)")
dbExecute(con, "CREATE INDEX idx_line_spatial_construction ON line_spatial_matches(construction_id)")
dbExecute(con, "ANALYZE line_spatial_matches")
dbExecute(con, "COMMIT")
cat("Created line_spatial_matches table with combined temporal and spatial filtering\n")
} else {
cat("Line spatial matches table already exists, skipping creation\n")
}
## Line spatial matches table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 2-L: Count construction line sites per accident
# Line-geometry analogue of Step 2.
#---------------------------------------------------------------------------
has_line_construction_counts <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'line_construction_counts')")$exists
if (!has_line_construction_counts) {
cat("Step 2-L: Creating line_construction_counts table...\n")
dbExecute(con, "BEGIN")
dbExecute(con, "
CREATE TABLE line_construction_counts AS
SELECT
accident_id,
COUNT(*) AS nearby_line_construction_count
FROM
line_spatial_matches
GROUP BY
accident_id
")
# Create index for efficient joins in the final step
dbExecute(con, "CREATE INDEX idx_line_construction_counts_accident ON line_construction_counts(accident_id)")
dbExecute(con, "ANALYZE line_construction_counts")
dbExecute(con, "COMMIT")
cat("Created line_construction_counts table\n")
} else {
cat("Line construction counts table already exists, skipping creation\n")
}
## Line construction counts table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 3-L: Create final line-based analysis table
# Line-geometry analogue of Step 3: one randomly chosen construction segment
# per accident, with the segment length carried through.
#---------------------------------------------------------------------------
has_line_accident_construction_analysis <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'line_accident_construction_analysis')")$exists
if (!has_line_accident_construction_analysis) {
cat("Step 3-L: Creating final line_accident_construction_analysis table...\n")
dbExecute(con, "BEGIN")
dbExecute(con, "
CREATE TABLE line_accident_construction_analysis AS
WITH ranked_matches AS (
SELECT
sm.accident_id,
sm.construction_id,
sm.accident_severity,
sm.accident_time,
sm.segment_length_m,
cc.nearby_line_construction_count,
-- Rank construction sites for each accident (random selection if multiple)
ROW_NUMBER() OVER (
PARTITION BY sm.accident_id
ORDER BY RANDOM() -- Random selection ensures unbiased assignment
) AS rank
FROM
line_spatial_matches sm
JOIN
line_construction_counts cc ON sm.accident_id = cc.accident_id
)
SELECT
accident_id,
construction_id,
accident_severity,
accident_time,
segment_length_m,
nearby_line_construction_count,
1 AS temporal_overlap, -- Always 1 due to our temporal filter in Step 1
1 AS is_near_active_construction -- Always 1 due to our filtering steps
FROM
ranked_matches
WHERE
rank = 1 -- Select only one construction site per accident
")
# Create indices for query optimization
dbExecute(con, "CREATE INDEX idx_laca_accident_id ON line_accident_construction_analysis(accident_id)")
dbExecute(con, "CREATE INDEX idx_laca_construction_id ON line_accident_construction_analysis(construction_id)")
dbExecute(con, "CREATE INDEX idx_laca_count ON line_accident_construction_analysis(nearby_line_construction_count)")
dbExecute(con, "ANALYZE line_accident_construction_analysis")
dbExecute(con, "COMMIT")
cat("Created line_accident_construction_analysis table\n")
} else {
cat("Line accident construction analysis table already exists, skipping creation\n")
}
## Line accident construction analysis table already exists, skipping creation
#---------------------------------------------------------------------------
# STEP 4-L: Calculate accident rates by segment length
#---------------------------------------------------------------------------
# Check whether the rates table already exists (idempotent re-runs)
has_accident_rates <- dbGetQuery(con, "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'construction_segment_accident_rates')")$exists
if (!has_accident_rates) {
  cat("Step 4-L: Creating construction_segment_accident_rates table...\n")
  # Atomic creation + indexing: DBI::dbWithTransaction() rolls back on any
  # failure instead of leaving the connection in an aborted transaction,
  # which the former manual BEGIN/COMMIT pair could not guarantee.
  DBI::dbWithTransaction(con, {
    # One row per construction line; LEFT JOIN keeps segments with zero
    # matched accidents (accident_count = 0, segment_length_m = NULL).
    dbExecute(con, "
      CREATE TABLE construction_segment_accident_rates AS
      SELECT
        cl.\"ID\" AS construction_id,
        cl.geometry,
        COUNT(laca.accident_id) AS accident_count,
        MAX(laca.segment_length_m) AS segment_length_m,
        -- NULLIF guards the division: a zero-length segment yields a NULL
        -- rate instead of aborting the whole CREATE TABLE with a
        -- division-by-zero error
        COUNT(laca.accident_id) / (NULLIF(MAX(laca.segment_length_m), 0) / 1000.0) AS accidents_per_km
      FROM
        construction_lines cl
      LEFT JOIN
        line_accident_construction_analysis laca ON cl.\"ID\" = laca.construction_id
      GROUP BY
        cl.\"ID\", cl.geometry
    ")
    # Create spatial (GiST) index for geometry-based queries
    dbExecute(con, "CREATE INDEX idx_accident_rates_geometry ON construction_segment_accident_rates USING GIST(geometry)")
    dbExecute(con, "ANALYZE construction_segment_accident_rates")
  })
  cat("Created construction_segment_accident_rates table\n")
} else {
  cat("Construction segment accident rates table already exists, skipping creation\n")
}
## Construction segment accident rates table already exists, skipping creation
# Read accident rates by construction segment
# st_read() executes the query over the open DBI connection and returns an
# sf data frame, converting the PostGIS geometry column into sfc geometries.
# The WHERE clause drops zero-length segments so accidents_per_km is defined.
accident_rates_by_segment <- st_read(con, query = "
SELECT
construction_id,
accident_count,
segment_length_m,
accidents_per_km,
geometry
FROM
construction_segment_accident_rates
WHERE
segment_length_m > 0 -- Avoid division by zero
ORDER BY
accidents_per_km DESC
")
# Display summary statistics
cat("Construction segments analyzed:", nrow(accident_rates_by_segment), "\n")
## Construction segments analyzed: 12280
# na.rm = TRUE: rate/length can be NA for segments without matched accidents
cat("Maximum accidents per km:", max(accident_rates_by_segment$accidents_per_km, na.rm = TRUE), "\n")
## Maximum accidents per km: 4167.822
cat("Average accidents per km:", mean(accident_rates_by_segment$accidents_per_km, na.rm = TRUE), "\n")
## Average accidents per km: 19.19031
cat("Median segment length (m):", median(accident_rates_by_segment$segment_length_m, na.rm = TRUE), "\n")
## Median segment length (m): 887.6822
# Severity analysis near line construction
# Frequency table of accident severity levels among accidents matched to a
# construction segment (one site per accident, per the rank = 1 selection in
# Step 3-L).
severity_by_construction <- dbGetQuery(con, "
SELECT
laca.accident_severity,
COUNT(*) AS count
FROM
line_accident_construction_analysis laca
GROUP BY
laca.accident_severity
ORDER BY
laca.accident_severity
")
# Comparison of accident severity near vs. away from construction
# Accidents present in line_accident_construction_analysis are labeled
# 'Near construction'; the anti-join pattern (LEFT JOIN ... WHERE
# laca.accident_id IS NULL) labels the remaining accidents 'Away from
# construction'. Both sets are stacked with UNION ALL and counted by
# severity and location.
severity_comparison <- dbGetQuery(con, "
WITH construction_accidents AS (
SELECT
a.\"ID\",
a.\"Severity\",
'Near construction' AS location
FROM
accidents a
JOIN
line_accident_construction_analysis laca ON a.\"ID\" = laca.accident_id
),
non_construction_accidents AS (
SELECT
a.\"ID\",
a.\"Severity\",
'Away from construction' AS location
FROM
accidents a
LEFT JOIN
line_accident_construction_analysis laca ON a.\"ID\" = laca.accident_id
WHERE
laca.accident_id IS NULL
)
SELECT
\"Severity\",
location,
COUNT(*) AS count
FROM (
SELECT * FROM construction_accidents
UNION ALL
SELECT * FROM non_construction_accidents
) combined
GROUP BY
\"Severity\", location
ORDER BY
\"Severity\", location
")
# Time-based analysis
# Monthly counts of accidents near construction: DATE_TRUNC('month', ...)
# buckets each accident timestamp to the first instant of its month.
temporal_analysis <- dbGetQuery(con, "
SELECT
DATE_TRUNC('month', laca.accident_time) AS month,
COUNT(*) AS accident_count
FROM
line_accident_construction_analysis laca
GROUP BY
DATE_TRUNC('month', laca.accident_time)
ORDER BY
month
")
# Disconnect from database now that all queries have completed
dbDisconnect(con)
In theory, the commented code below would achieve the same result as our PostGIS operation, but much more slowly. We will not run it here, but we keep it to demonstrate that the analysis could be implemented in R as well.
# Set up parallel processing
#future::plan(future::multisession, workers = parallel::detectCores() - 1)
# Set sf to use parallel processing
#sf_use_s2(FALSE) # Disable S2 spherical geometry
#cores <- parallel::detectCores() - 1
# Process each dataset with batching
#cat("Creating point buffers...\n")
#const_buffers <- create_buffers_in_batches(df.const.sf, buffer_dist = 1000, batch_size = 500)
# Compute accidents within construction zones
#accidents_in_construction <- df.acc.sf %>%
# st_join(const_buffers, join = st_within, prepared = TRUE)
# Return to sequential processing
#future::plan(future::sequential)
#sf_use_s2(TRUE) # Re-enable S2 spherical geometry
# Count accidents per construction zone
#accidents_per_zone <- accidents_in_construction %>%
# group_by(ID) %>% # Grouping by the ID column
# summarize(accident_count = n())
# Drop rows containing missing values before plotting
accidents_per_zone <- accidents_per_zone %>% drop_na()
# Visualize construction zones with accident counts
# First, store our plot in a variable
accident_zones_plot <- ggplot() +
  geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
  # Arrange data in ascending order so high-frequency zones are drawn last
  # and therefore appear on top of the other points
  geom_sf(data = accidents_per_zone %>% arrange(accident_count),
          aes(color = accident_count), size = 1.5) +
  scale_color_viridis_c() +
  theme_minimal() +
  labs(title = "Accident Counts within Construction Zones (2021)")
print(accident_zones_plot)
# Ensure the output directory exists before saving: ggsave() errors out
# if the target directory is missing
if (!dir.exists("imgs")) {
  dir.create("imgs", recursive = TRUE)
}
# Then save it using ggsave
ggsave(filename = "imgs/accident_counts_construction_zones.png",
       plot = accident_zones_plot,
       width = 10, # width in inches
       height = 8, # height in inches
       dpi = 300) # resolution
# Static map: California road network in light gray with the state's
# construction line segments overlaid in orange.
ggplot() +
  geom_sf(data = ca_roads, size = 0.1, color = "gray80") +
  geom_sf(
    data = filter(df.const.lines, State == "CA"),
    size = 1,
    color = "orange"
  ) +
  labs(title = "US Construction Segments (2021)") +
  theme_minimal()
# Interactive leaflet map of construction segments, centered on California.
# Each polyline carries a popup with the segment's ID and duration.
construction_segment_map <- leaflet() %>%
  addTiles() %>%
  addPolylines(
    data = df.const.lines,
    popup = ~paste("ID:", ID, "<br>Duration:", duration),
    color = "orange",
    opacity = 0.7,
    weight = 3
  ) %>%
  # Initial view centered on California
  setView(lat = 36.7783, lng = -119.4179, zoom = 6) %>%
  # Constrain panning to a bounding box around the state
  setMaxBounds(
    lng1 = -124.6, lat1 = 42.0, # Northwest corner of CA
    lng2 = -114.1, lat2 = 32.5  # Southeast corner of CA
  )
# Display the map
construction_segment_map